Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques
# Session-wide settings for this analysis:
#   warn   = -1  turns off all warnings for the rest of the session
#   scipen = 999 makes R strongly prefer fixed over scientific notation
#   digits = 2   prints numbers with roughly two significant digits
options(warn = -1, scipen = 999, digits = 2)

# Remove every (non-hidden) object from the current environment.
rm(list = ls())
library(magrittr)
Used when there are a lot of variables and we don't want to go in and name the levels manually.
# Parse the data dictionary into a long table with one row per
# (column, level) pair, so factor levels do not have to be typed by hand.

# Read the whole file as one long string.
v_data_description <- readr::read_file(
  file = "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/data_description.txt"
)

# Split the string on "\n" so that each element is one line of the file.
v_data_description <- stringr::str_split(
  string = v_data_description,
  pattern = "\n"
) %>%
  unlist()

# Row indices of every line that contains ":" (column names are followed
# by a colon).
v_colnames <- grep(
  x = v_data_description,
  pattern = ":"
)

# A header line is "<name>: <text>", i.e. a colon, a space and a non-digit.
v_is_header <- grepl(
  x = v_data_description,
  pattern = ": \\D"
)

# Pair every line with the most recent header line above it. cumsum() counts
# the headers seen so far; lines that come before the first header get NA
# instead of failing on an undefined value.
M_data_description <- data.frame(
  column = c(NA_character_, v_data_description[v_is_header])[cumsum(v_is_header) + 1L],
  levels = v_data_description
)

# Keep only the level code: drop everything from the first tab onwards
# (the tab separates the code from its description), then trim whitespace.
M_data_description$levels <- trimws(gsub(
  x = M_data_description$levels,
  pattern = "\t.*",
  replacement = ""
))

# Drop the rows that contain ":" (the column-name lines) ...
M_data_description <- M_data_description[!grepl(
  x = M_data_description$levels,
  pattern = ":"
), ]

# ... and the rows that are empty after trimming.
M_data_description <- M_data_description[M_data_description$levels != "", ]

# Reduce "<name>: <text>" to "<name>" by deleting everything from the colon on.
M_data_description$column <- gsub(
  x = M_data_description$column,
  pattern = ":.*",
  replacement = ""
) %>%
  trimws() # %>% janitor::make_clean_names()

M_data_description
Whether to row-bind the train and test sets depends on the data and the specific need; it is needed in this case.
library(magrittr)
# Read every column as character so values such as "00934" keep their leading
# zeros. A default column type is used instead of a fixed-length type string
# because train.csv and test.csv do not have the same number of columns.
col_types <- readr::cols(.default = readr::col_character())

M_train <- readr::read_csv(
  file = "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/train.csv",
  col_types = col_types,
  name_repair = "minimal"
) %>%
  as.data.frame() %>%
  dplyr::mutate(TrainTest = "Train")

# The test set gets an all-missing SalePrice (typed as character, like the
# train column) and a TrainTest marker so both sets have matching columns.
M_test <- readr::read_csv(
  file = "/Users/thienpham/Data Mining/data/house-prices-advanced-regression-techniques/test.csv",
  col_types = col_types,
  name_repair = "minimal"
) %>%
  as.data.frame() %>%
  dplyr::mutate(SalePrice = NA_character_, TrainTest = "Test")

# Stack train on top of test, turn the target numeric (it was read as
# character), and drop PoolArea, which is almost constant and so not useful.
M <- dplyr::bind_rows(M_train, M_test) %>%
  dplyr::mutate(SalePrice = as.numeric(x = SalePrice)) %>%
  dplyr::select(-PoolArea) %>%
  as.data.frame()
Remember that this creates a new level called “N/A”.
# Class of every column of M.
v_class <- sapply(
  X = M,
  FUN = class
)

# In every character column, recode both real missing values and the literal
# string "NA" to the explicit level "N/A". is.character() is used for the type
# check so that it also works for columns carrying more than one class.
for (j in colnames(M)) {
  if (is.character(M[[j]])) {
    M[[j]][is.na(M[[j]]) | M[[j]] == "NA"] <- "N/A"
  }
}
Sometimes there are levels in the actual data that aren't in the data description, including the “N/A” level we made.
# Convert every column listed in the data description to a factor.
for (j in unique(M_data_description$column)) {
  # Levels documented for column j, in the order of the description file.
  v_levels <- M_data_description[M_data_description$column == j, "levels"]

  # Documented levels that occur in the data come first, followed by the
  # levels found in the data that the description does not mention.
  v_levels <- c(
    v_levels[v_levels %in% M[, j]],
    setdiff(unique(M[, j]), v_levels)
  )

  M[, j] <- factor(x = M[, j], levels = v_levels)
}

# Make sure the target is numeric.
M$SalePrice <- as.numeric(x = M$SalePrice)

# Columns that hold numbers but were read in as text.
v_numeric <- c(
  "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea",
  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
  "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
  "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
  "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF",
  "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "MiscVal",
  "MoSold", "YrSold"
)

# Text that cannot be parsed as a number becomes NA.
for (j in v_numeric) {
  M[[j]] <- as.numeric(x = M[[j]])
}
We visualize our data to see how the target variable, sale price, is distributed.
library(ggplot2)
# Bin the target variable into 3 groups holding (about) the same number of
# observations; the bins are only used to color the plots.
# `right = FALSE` makes the intervals left-closed, [a, b). cut_number() has no
# `closed` argument (that belongs to cut_width()), so `closed = "left"` was
# silently ignored by cut().
M$Bin <- cut_number(
  x = M$SalePrice,
  n = 3,
  right = FALSE
)

# Histogram of the observed sale prices; rows without a sale price are
# filtered out first.
M %>%
  dplyr::filter(!is.na(SalePrice)) %>%
  ggplot() +
  aes(x = SalePrice, fill = Bin) +
  geom_histogram() +
  theme_bw() +
  labs(
    title = "Histogram of sale prices",
    subtitle = "Notice that the tail on the right.\nEach color has 33% of data.",
    caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notice that our data is right-skewed, so we apply a Box-Cox transformation to make the distribution more symmetric.
# Estimate the Box-Cox transform from the SalePrice column alone.
preProcess_SalePrice <- caret::preProcess(
  x = dplyr::select(M, SalePrice),
  method = "BoxCox"
)

# Apply the fitted transform and keep the result in a new column, so the
# original SalePrice column stays as it is.
M$BoxCox_SalePrice <- predict(object = preProcess_SalePrice, newdata = M)$SalePrice

# Histogram of the transformed target, restricted to rows with a sale price.
ggplot(
  data = dplyr::filter(M, !is.na(SalePrice)),
  mapping = aes(x = BoxCox_SalePrice, fill = Bin)
) +
  geom_histogram() +
  theme_bw() +
  labs(
    title = "Histogram of Box-Cox transformed sale prices",
    subtitle = "Notice that the histogram is closer to symmetric.\nEach color has 33% of data.",
    caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notice that the Box-Cox transform is stored in a new variable; it does not
actually change our original target variable.
Use Wilcoxon rank-sum test p-values (the mean of the pairwise p-values between the levels of each variable) to order the categorical predictor variables by impact, with the most impactful first.

* If the boxes are far apart from each other vertically, the variable is a good predictor.
* If the boxes are at similar heights, the variable does not tell us much about the target variable.
* The boxes are filled with a color based on their median sale price.
# For every categorical column in the data description, run pairwise Wilcoxon
# tests of SalePrice between its levels (rows with a sale price only) and
# summarise them as the mean of the pairwise p-values.
# vapply() guarantees one number per column.
v_wilcox.test <- vapply(
  X = unique(
    x = M_data_description$column
  ),
  FUN = function(j) mean(as.vector(pairwise.wilcox.test(
    x = M$SalePrice[!is.na(M$SalePrice)],
    g = M[!is.na(M$SalePrice), j]
  )$p.value), na.rm = TRUE),
  FUN.VALUE = numeric(1)
)

# Smallest mean p-value first.
v_wilcox.test <- sort(
  x = v_wilcox.test
)

library(ggplot2)

# One box plot of sale price per categorical predictor, in the sorted order.
for (j in names(v_wilcox.test)) {
  # Attach the per-level median sale price; it drives the fill color.
  # across(all_of(j)) groups by the column whose name is stored in j
  # (replaces the deprecated group_by_()).
  M_plot <- M[, c("SalePrice", j)] %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::mutate(median_SalePrice = median(SalePrice, na.rm = TRUE)) %>%
    dplyr::ungroup()

  # .data[[j]] maps the column named by j (replaces the deprecated
  # aes_string()); the x-axis label is set explicitly to the column name.
  p <- ggplot(M_plot) +
    aes(x = .data[[j]], y = SalePrice, fill = median_SalePrice) +
    geom_boxplot(color = "grey50") +
    theme_bw() +
    labs(
      x = j,
      title = paste0("Box plots of sale price by ", j),
      subtitle = "Boxes with very different vertical positions give better predictors",
      caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
    )
  plot(p)
}
Capping extreme values prevents outliers from pulling the model away from the rest of the data.
# Give the numeric columns clean snake_case names, and clean the name vector
# itself the same way so that it keeps matching the columns of M.
names(M)[names(M) %in% v_numeric] <- janitor::make_clean_names(
  string = names(M)[names(M) %in% v_numeric]
)
v_numeric <- janitor::make_clean_names(string = v_numeric)

# Cap each numeric column at its own 1st and 99th percentiles: anything below
# the 1% cutoff is raised to it and anything above the 99% cutoff is lowered
# to it. Missing values stay missing.
# Example: in 1, 14, 15, 16, 15, 16, 100 the 100 becomes 16 and the 1 becomes 14.
for (j in v_numeric) {
  v_cutoffs <- quantile(x = M[, j], probs = c(0.01, 0.99), na.rm = TRUE)
  M[, j] <- pmin(pmax(M[, j], v_cutoffs[1]), v_cutoffs[2])
}
The transformation applied to each numeric predictor below was chosen during data-preparation exploration.
# Numeric predictors that get a Box-Cox transform.
v_BoxCox <- c(
  "lot_frontage", "lot_area", "year_built", "year_remod_add", "x1st_flr_sf",
  "gr_liv_area", "tot_rms_abv_grd", "garage_yr_blt", "mo_sold"
)

# Numeric predictors that get a Yeo-Johnson transform.
v_YeoJohnson <- c(
  "mas_vnr_area", "bsmt_fin_sf1", "bsmt_fin_sf2", "bsmt_unf_sf",
  "total_bsmt_sf", "x2nd_flr_sf", "wood_deck_sf", "open_porch_sf",
  "enclosed_porch"
)

# Numeric predictors that are left untransformed. Each name is listed once,
# and "pool_area" is not listed because PoolArea was dropped from M when the
# data was read in.
v_notransform <- c(
  "low_qual_fin_sf", "bsmt_half_bath", "x3ssn_porch", "misc_val",
  "garage_cars", "garage_area", "full_bath", "fireplaces", "half_bath",
  "bedroom_abv_gr", "bsmt_full_bath", "kitchen_abv_gr", "screen_porch",
  "yr_sold"
)

# Estimate each transform on its own group of columns ...
preProcess_BoxCox <- caret::preProcess(
  x = M[, v_BoxCox],
  method = "BoxCox"
)
preProcess_YeoJohnson <- caret::preProcess(
  x = M[, v_YeoJohnson],
  method = "YeoJohnson"
)

# ... and overwrite those columns with their transformed values.
M[, v_BoxCox] <- predict(
  object = preProcess_BoxCox,
  newdata = M[, v_BoxCox]
)
M[, v_YeoJohnson] <- predict(
  object = preProcess_YeoJohnson,
  newdata = M[, v_YeoJohnson]
)
# Spearman correlation of every numeric predictor with the Box-Cox transformed
# sale price, using the rows where both values are present.
# vapply() guarantees one number per predictor.
v_cor <- vapply(
  X = v_numeric,
  FUN = function(j) cor(
    x = M[, j],
    y = M$BoxCox_SalePrice,
    method = "spearman",
    use = "pairwise.complete.obs"
  ),
  FUN.VALUE = numeric(1)
)

# Order the correlations from largest absolute value to smallest.
v_cor <- v_cor[order(
  x = abs(
    x = v_cor
  ),
  decreasing = TRUE
)]

# One scatter plot per numeric predictor, strongest correlation first.
for (j in names(v_cor)) {
  M_plot <- M[, c("BoxCox_SalePrice", "Bin", j)]

  # .data[[j]] maps the column named by j (replaces the deprecated
  # aes_string()); the x-axis label is set explicitly to the column name.
  p <- ggplot(M_plot[complete.cases(M_plot), ]) +
    aes(x = .data[[j]], y = BoxCox_SalePrice) +
    geom_point(aes(color = Bin)) +
    # loess is used because a linear fit was too rigid
    geom_smooth(color = "black", se = FALSE, method = "loess") +
    theme_bw() +
    labs(
      x = j,
      title = paste0("Scatter plot of sale price by ", j),
      subtitle = paste0("Spearman correlation: ", round(v_cor[j], 2), "\nEach color has 33% of the points."),
      caption = "Data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques"
    )
  plot(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Normally the preProcess function from the caret package would be preferred, because we would not have to refit every time we get new data. The missRanger package was used here because we are working with both the train and test sets, which are finalized.
# All predictors: the numeric columns plus the categorical columns that were
# ranked by the Wilcoxon tests.
v_predictors <- c(v_numeric, names(v_wilcox.test))

# Fill in the missing predictor values with missRanger, using a fixed seed
# and allowing at most 100 iterations.
M[, v_predictors] <- missRanger::missRanger(
  M[, v_predictors],
  maxiter = 100,
  seed = 823,
  returnOOB = TRUE
)
##
## Missing value imputation by random forests
##
## Variables to impute: lot_frontage, mas_vnr_area, bsmt_fin_sf1, bsmt_fin_sf2, bsmt_unf_sf, total_bsmt_sf, bsmt_full_bath, bsmt_half_bath, garage_yr_blt, garage_cars, garage_area
## Variables used to impute: lot_frontage, lot_area, year_built, year_remod_add, mas_vnr_area, bsmt_fin_sf1, bsmt_fin_sf2, bsmt_unf_sf, total_bsmt_sf, x1st_flr_sf, x2nd_flr_sf, low_qual_fin_sf, gr_liv_area, bsmt_full_bath, bsmt_half_bath, full_bath, half_bath, bedroom_abv_gr, kitchen_abv_gr, tot_rms_abv_grd, fireplaces, garage_yr_blt, garage_cars, garage_area, wood_deck_sf, open_porch_sf, enclosed_porch, x3ssn_porch, screen_porch, misc_val, mo_sold, yr_sold, CentralAir, GarageFinish, KitchenQual, ExterQual, PavedDrive, BsmtExposure, BsmtQual, MSZoning, FireplaceQu, BsmtCond, OverallQual, LandContour, Street, MasVnrType, GarageType, HeatingQC, BsmtFinType1, Alley, Neighborhood, LotShape, HouseStyle, BldgType, Fence, Foundation, MSSubClass, SaleCondition, OverallCond, LandSlope, GarageQual, Utilities, ExterCond, GarageCond, Condition1, BsmtFinType2, LotConfig, Functional, Heating, Exterior1st, Electrical, Exterior2nd, SaleType, MiscFeature, PoolQC, RoofStyle, RoofMatl, Condition2
##
## iter 1
##
|
| | 0%
|
|====== | 9%
|
|============= | 18%
|
|=================== | 27%
|
|========================= | 36%
|
|================================ | 45%
|
|====================================== | 55%
|
|============================================= | 64%
|
|=================================================== | 73%
|
|========================================================= | 82%
|
|================================================================ | 91%
|
|======================================================================| 100%
## iter 2
##
|
| | 0%
|
|====== | 9%
|
|============= | 18%
|
|=================== | 27%
|
|========================= | 36%
|
|================================ | 45%
|
|====================================== | 55%
|
|============================================= | 64%
|
|=================================================== | 73%
|
|========================================================= | 82%
|
|================================================================ | 91%
|
|======================================================================| 100%
## iter 3
##
|
| | 0%
|
|====== | 9%
|
|============= | 18%
|
|=================== | 27%
|
|========================= | 36%
|
|================================ | 45%
|
|====================================== | 55%
|
|============================================= | 64%
|
|=================================================== | 73%
|
|========================================================= | 82%
|
|================================================================ | 91%
|
|======================================================================| 100%
The imputation took 3 iterations, which is fast. If it had taken all 100 iterations, that would be a sign that the imputation model is bad.
During feature selection we will use our association analysis to identify which predictors to keep and which to drop.
For numeric predictors we will use Spearman’s correlation since it is better at detecting non-linear monotonic relationships.
For categorical predictors we will use Cramer’s V in place of correlation.
# Pairwise Spearman correlations between the numeric predictors.
M_cor <- cor(M[, v_numeric], method = "spearman", use = "pairwise.complete.obs")

# Pairwise Cramer's V between the categorical predictors.
M_CramerV <- DescTools::PairApply(M[, names(v_wilcox.test)], FUN = DescTools::CramerV)

# Correlation plot for the numeric predictors.
# The hclust ordering below was left out because there was a problem with the
# pool column for this data set. It should definitely be run when possible:
# without it the plot only gives correlation information for pairs of
# variables, whereas the ordering would also show which groups of variables
# carry the same information (useful when removing features).
corrplot::corrplot(
  M_cor,
  is.corr = FALSE,
  diag = FALSE
  # , order = "hclust",
  # hclust.method = "ward.D"
)

# Cramer's V plot for the categorical predictors, ordered by Ward clustering.
corrplot::corrplot(
  M_CramerV,
  order = "hclust",
  hclust.method = "ward.D",
  is.corr = FALSE,
  diag = FALSE
)
Notice the squares in the correlation matrix, this means if we remove
one variable, the other variables in that square could pick up its
slack.
Remember that Ward's method (ward.D) gives us more even groups for deciding which variables to remove during feature selection, while single linkage does a good job of identifying outliers, telling us which variables we shouldn't remove.
# Ward clustering of the numeric predictors, with 1 - Spearman correlation as
# the distance. The `d` expression is kept inline exactly as written:
# plot.hclust() uses the deparsed `d` argument as the x-axis label.
plot(hclust(d = as.dist(m = 1 - M_cor), method = "ward.D"))
If we get rid of lot_frontage, lot_area could cover for it.
# Single-linkage clustering of the numeric predictors, with 1 - Spearman
# correlation as the distance. The `d` expression is kept inline exactly as
# written: plot.hclust() uses the deparsed `d` argument as the x-axis label.
plot(hclust(d = as.dist(m = 1 - M_cor), method = "single"))
Year sold is very different from all of the other predictor variables, so
we shouldn't throw it out.

## 14. Variable clustering on the categorical variables
# Ward clustering of the categorical predictors, with 1 - Cramer's V as the
# distance. The `d` expression is kept inline exactly as written:
# plot.hclust() uses the deparsed `d` argument as the x-axis label.
plot(hclust(d = as.dist(m = 1 - M_CramerV), method = "ward.D"))
Exterior quality and overall quality contain the same information, so we
could remove one or the other.
# Single-linkage clustering of the categorical predictors, with 1 - Cramer's V
# as the distance. The `d` expression is kept inline exactly as written:
# plot.hclust() uses the deparsed `d` argument as the x-axis label.
plot(hclust(d = as.dist(m = 1 - M_CramerV), method = "single"))
We can see that fence is distinct from all of the other predictor variables, so
we should not remove it, because it contains unique information.
To make model fitting faster and to prevent over-fitting, categorical and ordinal columns will be binned into two levels and represented with a binary column. The binning strategy will try for 50/50 bins. (If this is unclear, check this portion of the categorical data prep notes 02.)
Two strategies are used:
# Categorical predictors that are reduced to "most frequent level vs. the rest".
v_other <- c(
  "CentralAir", "PavedDrive", "BsmtExposure", "MSZoning", "BsmtCond",
  "LandContour", "Street", "Alley", "BldgType", "Fence", "SaleCondition",
  "LandSlope", "GarageQual", "Utilities", "ExterCond", "GarageCond",
  "Condition1", "BsmtFinType2", "LotConfig", "Functional", "Heating",
  "Electrical", "SaleType", "MiscFeature", "PoolQC", "RoofStyle", "RoofMatl",
  "Condition2"
)

for (j in v_other) {
  # Level counts, most frequent level first.
  v_table <- sort(x = table(x = M[, j]), decreasing = TRUE)

  # Replace the column by a 0/1 indicator: 1 for the most frequent level.
  M[, j] <- as.numeric(x = M[, j] == names(v_table)[1])

  # Print the level counts and proportions for reference.
  v_table <- unclass(x = v_table)
  M_class <- data.frame(
    level = names(v_table),
    n = v_table,
    proportion = prop.table(v_table)
  )
  print(x = "---------------------------------------------------------------------")
  print(x = j)
  print(x = M_class)
}
## [1] "---------------------------------------------------------------------"
## [1] "CentralAir"
## level n proportion
## Y Y 2723 0.933
## N N 196 0.067
## [1] "---------------------------------------------------------------------"
## [1] "PavedDrive"
## level n proportion
## Y Y 2641 0.905
## N N 216 0.074
## P P 62 0.021
## [1] "---------------------------------------------------------------------"
## [1] "BsmtExposure"
## level n proportion
## No No 1904 0.652
## Av Av 418 0.143
## Gd Gd 276 0.095
## Mn Mn 239 0.082
## N/A N/A 82 0.028
## [1] "---------------------------------------------------------------------"
## [1] "MSZoning"
## level n proportion
## RL RL 2265 0.7760
## RM RM 460 0.1576
## FV FV 139 0.0476
## RH RH 26 0.0089
## C (all) C (all) 25 0.0086
## N/A N/A 4 0.0014
## [1] "---------------------------------------------------------------------"
## [1] "BsmtCond"
## level n proportion
## TA TA 2606 0.8928
## Gd Gd 122 0.0418
## Fa Fa 104 0.0356
## N/A N/A 82 0.0281
## Po Po 5 0.0017
## [1] "---------------------------------------------------------------------"
## [1] "LandContour"
## level n proportion
## Lvl Lvl 2622 0.898
## HLS HLS 120 0.041
## Bnk Bnk 117 0.040
## Low Low 60 0.021
## [1] "---------------------------------------------------------------------"
## [1] "Street"
## level n proportion
## Pave Pave 2907 0.9959
## Grvl Grvl 12 0.0041
## [1] "---------------------------------------------------------------------"
## [1] "Alley"
## level n proportion
## N/A N/A 2721 0.932
## Grvl Grvl 120 0.041
## Pave Pave 78 0.027
## [1] "---------------------------------------------------------------------"
## [1] "BldgType"
## level n proportion
## 1Fam 1Fam 2425 0.831
## TwnhsE TwnhsE 227 0.078
## Duplex Duplex 109 0.037
## Twnhs Twnhs 96 0.033
## 2fmCon 2fmCon 62 0.021
## [1] "---------------------------------------------------------------------"
## [1] "Fence"
## level n proportion
## N/A N/A 2348 0.8044
## MnPrv MnPrv 329 0.1127
## GdPrv GdPrv 118 0.0404
## GdWo GdWo 112 0.0384
## MnWw MnWw 12 0.0041
## [1] "---------------------------------------------------------------------"
## [1] "SaleCondition"
## level n proportion
## Normal Normal 2402 0.8229
## Partial Partial 245 0.0839
## Abnorml Abnorml 190 0.0651
## Family Family 46 0.0158
## Alloca Alloca 24 0.0082
## AdjLand AdjLand 12 0.0041
## [1] "---------------------------------------------------------------------"
## [1] "LandSlope"
## level n proportion
## Gtl Gtl 2778 0.9517
## Mod Mod 125 0.0428
## Sev Sev 16 0.0055
## [1] "---------------------------------------------------------------------"
## [1] "GarageQual"
## level n proportion
## TA TA 2604 0.8921
## N/A N/A 159 0.0545
## Fa Fa 124 0.0425
## Gd Gd 24 0.0082
## Po Po 5 0.0017
## Ex Ex 3 0.0010
## [1] "---------------------------------------------------------------------"
## [1] "Utilities"
## level n proportion
## AllPub AllPub 2916 0.99897
## N/A N/A 2 0.00069
## NoSeWa NoSeWa 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "ExterCond"
## level n proportion
## TA TA 2538 0.8695
## Gd Gd 299 0.1024
## Fa Fa 67 0.0230
## Ex Ex 12 0.0041
## Po Po 3 0.0010
## [1] "---------------------------------------------------------------------"
## [1] "GarageCond"
## level n proportion
## TA TA 2654 0.9092
## N/A N/A 159 0.0545
## Fa Fa 74 0.0254
## Gd Gd 15 0.0051
## Po Po 14 0.0048
## Ex Ex 3 0.0010
## [1] "---------------------------------------------------------------------"
## [1] "Condition1"
## level n proportion
## Norm Norm 2511 0.8602
## Feedr Feedr 164 0.0562
## Artery Artery 92 0.0315
## RRAn RRAn 50 0.0171
## PosN PosN 39 0.0134
## RRAe RRAe 28 0.0096
## PosA PosA 20 0.0069
## RRNn RRNn 9 0.0031
## RRNe RRNe 6 0.0021
## [1] "---------------------------------------------------------------------"
## [1] "BsmtFinType2"
## level n proportion
## Unf Unf 2493 0.854
## Rec Rec 105 0.036
## LwQ LwQ 87 0.030
## N/A N/A 80 0.027
## BLQ BLQ 68 0.023
## ALQ ALQ 52 0.018
## GLQ GLQ 34 0.012
## [1] "---------------------------------------------------------------------"
## [1] "LotConfig"
## level n proportion
## Inside Inside 2133 0.7307
## Corner Corner 511 0.1751
## CulDSac CulDSac 176 0.0603
## FR2 FR2 85 0.0291
## FR3 FR3 14 0.0048
## [1] "---------------------------------------------------------------------"
## [1] "Functional"
## level n proportion
## Typ Typ 2717 0.93080
## Min2 Min2 70 0.02398
## Min1 Min1 65 0.02227
## Mod Mod 35 0.01199
## Maj1 Maj1 19 0.00651
## Maj2 Maj2 9 0.00308
## Sev Sev 2 0.00069
## N/A N/A 2 0.00069
## [1] "---------------------------------------------------------------------"
## [1] "Heating"
## level n proportion
## GasA GasA 2874 0.98458
## GasW GasW 27 0.00925
## Grav Grav 9 0.00308
## Wall Wall 6 0.00206
## OthW OthW 2 0.00069
## Floor Floor 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "Electrical"
## level n proportion
## SBrkr SBrkr 2671 0.91504
## FuseA FuseA 188 0.06441
## FuseF FuseF 50 0.01713
## FuseP FuseP 8 0.00274
## Mix Mix 1 0.00034
## N/A N/A 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "SaleType"
## level n proportion
## WD WD 2525 0.86502
## New New 239 0.08188
## COD COD 87 0.02980
## ConLD ConLD 26 0.00891
## CWD CWD 12 0.00411
## ConLI ConLI 9 0.00308
## ConLw ConLw 8 0.00274
## Oth Oth 7 0.00240
## Con Con 5 0.00171
## N/A N/A 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "MiscFeature"
## level n proportion
## N/A N/A 2814 0.96403
## Shed Shed 95 0.03255
## Gar2 Gar2 5 0.00171
## Othr Othr 4 0.00137
## TenC TenC 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "PoolQC"
## level n proportion
## N/A N/A 2909 0.99657
## Ex Ex 4 0.00137
## Gd Gd 4 0.00137
## Fa Fa 2 0.00069
## [1] "---------------------------------------------------------------------"
## [1] "RoofStyle"
## level n proportion
## Gable Gable 2310 0.7914
## Hip Hip 551 0.1888
## Gambrel Gambrel 22 0.0075
## Flat Flat 20 0.0069
## Mansard Mansard 11 0.0038
## Shed Shed 5 0.0017
## [1] "---------------------------------------------------------------------"
## [1] "RoofMatl"
## level n proportion
## CompShg CompShg 2876 0.98527
## Tar&Grv Tar&Grv 23 0.00788
## WdShake WdShake 9 0.00308
## WdShngl WdShngl 7 0.00240
## ClyTile ClyTile 1 0.00034
## Membran Membran 1 0.00034
## Metal Metal 1 0.00034
## Roll Roll 1 0.00034
## [1] "---------------------------------------------------------------------"
## [1] "Condition2"
## level n proportion
## Norm Norm 2889 0.98972
## Feedr Feedr 13 0.00445
## Artery Artery 5 0.00171
## PosN PosN 4 0.00137
## PosA PosA 4 0.00137
## RRNn RRNn 2 0.00069
## RRAn RRAn 1 0.00034
## RRAe RRAe 1 0.00034
# Categorical predictors that are split into a "low" and a "high" group based
# on the mean Box-Cox sale price of their levels.
v_mean <- c(
  "GarageFinish", "KitchenQual", "ExterQual", "BsmtQual", "FireplaceQu",
  "OverallQual", "MasVnrType", "GarageType", "HeatingQC", "BsmtFinType1",
  "Neighborhood", "LotShape", "HouseStyle", "Foundation", "MSSubClass",
  "OverallCond", "Exterior1st", "Exterior2nd"
)

for (j in v_mean) {
  # One row per level of column j: mean transformed sale price, count and
  # share of rows. all_of()/across() select and group by the column whose name
  # is stored in j (they replace the deprecated select_()/group_by_()).
  M_BoxCox_SalePrice <- M %>%
    dplyr::select(dplyr::all_of(c(j, "BoxCox_SalePrice"))) %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::summarise(
      BoxCox_SalePrice = mean(BoxCox_SalePrice, na.rm = TRUE),
      n = dplyr::n()
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(proportion = n / sum(n)) %>%
    # Share of rows in this level or any level with a lower mean ...
    dplyr::arrange(BoxCox_SalePrice) %>%
    dplyr::mutate(cumsum_ascending = cumsum(proportion)) %>%
    # ... and share of rows in the levels strictly below this one.
    dplyr::arrange(dplyr::desc(cumsum_ascending)) %>%
    dplyr::mutate(cumsum_descending = 1 - cumsum(proportion)) %>%
    dplyr::arrange(cumsum_ascending) %>%
    # The midpoint of the two places every level on a 0-1 scale.
    dplyr::mutate(mean_cumsum = (cumsum_ascending + cumsum_descending) / 2) %>%
    dplyr::arrange(mean_cumsum) %>%
    as.data.frame()

  # Recode column j as 1 for the levels whose midpoint is at least 0.5 and 0
  # for all other levels.
  # NOTE(review): a level with no observed sale price has a NaN mean, is sorted
  # last by arrange(), and therefore lands in the "1" group - confirm that this
  # is intended.
  M[, j] <- as.numeric(
    x = M[, j] %in% M_BoxCox_SalePrice[M_BoxCox_SalePrice$mean_cumsum >= 0.5, j]
  )

  # Print the per-level table for reference.
  print(
    x = "---------------------------------------------------------------------"
  )
  print(
    x = j
  )
  print(knitr::kable(
    M_BoxCox_SalePrice
  ))
}
## [1] "---------------------------------------------------------------------"
## [1] "GarageFinish"
##
##
## |GarageFinish | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A | 11| 159| 0.05| 0.05| 0.00| 0.03|
## |Unf | 12| 1230| 0.42| 0.48| 0.05| 0.27|
## |RFn | 12| 811| 0.28| 0.75| 0.48| 0.61|
## |Fin | 12| 719| 0.25| 1.00| 0.75| 0.88|
## [1] "---------------------------------------------------------------------"
## [1] "KitchenQual"
##
##
## |KitchenQual | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Fa | 12| 70| 0.02| 0.02| 0.00| 0.01|
## |TA | 12| 1492| 0.51| 0.54| 0.02| 0.28|
## |Gd | 12| 1151| 0.39| 0.93| 0.54| 0.73|
## |Ex | 13| 205| 0.07| 1.00| 0.93| 0.96|
## |N/A | NaN| 1| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "ExterQual"
##
##
## |ExterQual | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Fa | 11| 35| 0.01| 0.01| 0.00| 0.01|
## |TA | 12| 1798| 0.62| 0.63| 0.01| 0.32|
## |Gd | 12| 979| 0.34| 0.96| 0.63| 0.80|
## |Ex | 13| 107| 0.04| 1.00| 0.96| 0.98|
## [1] "---------------------------------------------------------------------"
## [1] "BsmtQual"
##
##
## |BsmtQual | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:--------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A | 12| 81| 0.03| 0.03| 0.00| 0.01|
## |Fa | 12| 88| 0.03| 0.06| 0.03| 0.04|
## |TA | 12| 1283| 0.44| 0.50| 0.06| 0.28|
## |Gd | 12| 1209| 0.41| 0.91| 0.50| 0.70|
## |Ex | 13| 258| 0.09| 1.00| 0.91| 0.96|
## [1] "---------------------------------------------------------------------"
## [1] "FireplaceQu"
##
##
## |FireplaceQu | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Po | 12| 46| 0.02| 0.02| 0.00| 0.01|
## |N/A | 12| 1420| 0.49| 0.50| 0.02| 0.26|
## |Fa | 12| 74| 0.03| 0.53| 0.50| 0.51|
## |TA | 12| 592| 0.20| 0.73| 0.53| 0.63|
## |Gd | 12| 744| 0.25| 0.99| 0.73| 0.86|
## |Ex | 13| 43| 0.01| 1.00| 0.99| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "OverallQual"
##
##
## |OverallQual | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |1 | 11| 4| 0.00| 0.00| 0.00| 0.00|
## |2 | 11| 13| 0.00| 0.01| 0.00| 0.00|
## |3 | 11| 40| 0.01| 0.02| 0.01| 0.01|
## |4 | 12| 226| 0.08| 0.10| 0.02| 0.06|
## |5 | 12| 825| 0.28| 0.38| 0.10| 0.24|
## |6 | 12| 731| 0.25| 0.63| 0.38| 0.50|
## |7 | 12| 600| 0.21| 0.84| 0.63| 0.73|
## |8 | 12| 342| 0.12| 0.95| 0.84| 0.89|
## |9 | 13| 107| 0.04| 0.99| 0.95| 0.97|
## |10 | 13| 31| 0.01| 1.00| 0.99| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "MasVnrType"
##
##
## |MasVnrType | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |BrkCmn | 12| 25| 0.01| 0.01| 0.00| 0.00|
## |None | 12| 1742| 0.60| 0.61| 0.01| 0.31|
## |BrkFace | 12| 879| 0.30| 0.91| 0.61| 0.76|
## |N/A | 12| 24| 0.01| 0.91| 0.91| 0.91|
## |Stone | 12| 249| 0.09| 1.00| 0.91| 0.96|
## [1] "---------------------------------------------------------------------"
## [1] "GarageType"
##
##
## |GarageType | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |N/A | 11| 157| 0.05| 0.05| 0.00| 0.03|
## |CarPort | 12| 15| 0.01| 0.06| 0.05| 0.06|
## |Detchd | 12| 779| 0.27| 0.33| 0.06| 0.19|
## |2Types | 12| 23| 0.01| 0.33| 0.33| 0.33|
## |Basment | 12| 36| 0.01| 0.35| 0.33| 0.34|
## |Attchd | 12| 1723| 0.59| 0.94| 0.35| 0.64|
## |BuiltIn | 12| 186| 0.06| 1.00| 0.94| 0.97|
## [1] "---------------------------------------------------------------------"
## [1] "HeatingQC"
##
##
## |HeatingQC | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Po | 11| 3| 0.00| 0.00| 0.00| 0.00|
## |Fa | 12| 92| 0.03| 0.03| 0.00| 0.02|
## |TA | 12| 857| 0.29| 0.33| 0.03| 0.18|
## |Gd | 12| 474| 0.16| 0.49| 0.33| 0.41|
## |Ex | 12| 1493| 0.51| 1.00| 0.49| 0.74|
## [1] "---------------------------------------------------------------------"
## [1] "BsmtFinType1"
##
##
## |BsmtFinType1 | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |N/A | 12| 79| 0.03| 0.03| 0.00| 0.01|
## |Rec | 12| 288| 0.10| 0.13| 0.03| 0.08|
## |BLQ | 12| 269| 0.09| 0.22| 0.13| 0.17|
## |LwQ | 12| 154| 0.05| 0.27| 0.22| 0.24|
## |ALQ | 12| 429| 0.15| 0.42| 0.27| 0.34|
## |Unf | 12| 851| 0.29| 0.71| 0.42| 0.56|
## |GLQ | 12| 849| 0.29| 1.00| 0.71| 0.85|
## [1] "---------------------------------------------------------------------"
## [1] "Neighborhood"
##
##
## |Neighborhood | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:------------|----------------:|---:|----------:|----------------:|-----------------:|-----------:|
## |IDOTRR | 11| 93| 0.03| 0.03| 0.00| 0.02|
## |MeadowV | 11| 37| 0.01| 0.04| 0.03| 0.04|
## |BrDale | 12| 30| 0.01| 0.05| 0.04| 0.05|
## |BrkSide | 12| 108| 0.04| 0.09| 0.05| 0.07|
## |OldTown | 12| 239| 0.08| 0.17| 0.09| 0.13|
## |Edwards | 12| 194| 0.07| 0.24| 0.17| 0.21|
## |Sawyer | 12| 151| 0.05| 0.29| 0.24| 0.27|
## |Blueste | 12| 10| 0.00| 0.30| 0.29| 0.29|
## |SWISU | 12| 48| 0.02| 0.31| 0.30| 0.30|
## |NPkVill | 12| 23| 0.01| 0.32| 0.31| 0.32|
## |NAmes | 12| 443| 0.15| 0.47| 0.32| 0.40|
## |Mitchel | 12| 114| 0.04| 0.51| 0.47| 0.49|
## |SawyerW | 12| 125| 0.04| 0.55| 0.51| 0.53|
## |NWAmes | 12| 131| 0.04| 0.60| 0.55| 0.58|
## |Gilbert | 12| 165| 0.06| 0.65| 0.60| 0.63|
## |CollgCr | 12| 267| 0.09| 0.75| 0.65| 0.70|
## |Blmngtn | 12| 28| 0.01| 0.76| 0.75| 0.75|
## |Crawfor | 12| 103| 0.04| 0.79| 0.76| 0.77|
## |ClearCr | 12| 44| 0.02| 0.81| 0.79| 0.80|
## |Somerst | 12| 182| 0.06| 0.87| 0.81| 0.84|
## |Veenker | 12| 24| 0.01| 0.88| 0.87| 0.87|
## |Timber | 12| 72| 0.02| 0.90| 0.88| 0.89|
## |StoneBr | 13| 51| 0.02| 0.92| 0.90| 0.91|
## |NridgHt | 13| 166| 0.06| 0.98| 0.92| 0.95|
## |NoRidge | 13| 71| 0.02| 1.00| 0.98| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "LotShape"
##
##
## |LotShape | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:--------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Reg | 12| 1859| 0.64| 0.64| 0.00| 0.32|
## |IR1 | 12| 968| 0.33| 0.97| 0.64| 0.80|
## |IR3 | 12| 16| 0.01| 0.97| 0.97| 0.97|
## |IR2 | 12| 76| 0.03| 1.00| 0.97| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "HouseStyle"
##
##
## |HouseStyle | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |1.5Unf | 12| 19| 0.01| 0.01| 0.00| 0.00|
## |SFoyer | 12| 83| 0.03| 0.03| 0.01| 0.02|
## |1.5Fin | 12| 314| 0.11| 0.14| 0.03| 0.09|
## |2.5Unf | 12| 24| 0.01| 0.15| 0.14| 0.15|
## |1Story | 12| 1471| 0.50| 0.65| 0.15| 0.40|
## |SLvl | 12| 128| 0.04| 0.70| 0.65| 0.68|
## |2Story | 12| 872| 0.30| 1.00| 0.70| 0.85|
## |2.5Fin | 12| 8| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "Foundation"
##
##
## |Foundation | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |Slab | 12| 49| 0.02| 0.02| 0.00| 0.01|
## |BrkTil | 12| 311| 0.11| 0.12| 0.02| 0.07|
## |CBlock | 12| 1235| 0.42| 0.55| 0.12| 0.33|
## |Stone | 12| 11| 0.00| 0.55| 0.55| 0.55|
## |Wood | 12| 5| 0.00| 0.55| 0.55| 0.55|
## |PConc | 12| 1308| 0.45| 1.00| 0.55| 0.78|
## [1] "---------------------------------------------------------------------"
## [1] "MSSubClass"
##
##
## |MSSubClass | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |30 | 11| 139| 0.05| 0.05| 0.00| 0.02|
## |180 | 12| 17| 0.01| 0.05| 0.05| 0.05|
## |45 | 12| 18| 0.01| 0.06| 0.05| 0.06|
## |190 | 12| 61| 0.02| 0.08| 0.06| 0.07|
## |90 | 12| 109| 0.04| 0.12| 0.08| 0.10|
## |160 | 12| 128| 0.04| 0.16| 0.12| 0.14|
## |50 | 12| 287| 0.10| 0.26| 0.16| 0.21|
## |40 | 12| 6| 0.00| 0.26| 0.26| 0.26|
## |85 | 12| 48| 0.02| 0.28| 0.26| 0.27|
## |70 | 12| 128| 0.04| 0.32| 0.28| 0.30|
## |80 | 12| 118| 0.04| 0.36| 0.32| 0.34|
## |20 | 12| 1079| 0.37| 0.73| 0.36| 0.55|
## |75 | 12| 23| 0.01| 0.74| 0.73| 0.74|
## |120 | 12| 182| 0.06| 0.80| 0.74| 0.77|
## |60 | 12| 575| 0.20| 1.00| 0.80| 0.90|
## |150 | NaN| 1| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "OverallCond"
##
##
## |OverallCond | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |1 | 11| 7| 0.00| 0.00| 0.00| 0.00|
## |3 | 11| 50| 0.02| 0.02| 0.00| 0.01|
## |2 | 12| 10| 0.00| 0.02| 0.02| 0.02|
## |4 | 12| 101| 0.03| 0.06| 0.02| 0.04|
## |6 | 12| 531| 0.18| 0.24| 0.06| 0.15|
## |8 | 12| 144| 0.05| 0.29| 0.24| 0.26|
## |7 | 12| 390| 0.13| 0.42| 0.29| 0.36|
## |5 | 12| 1645| 0.56| 0.99| 0.42| 0.70|
## |9 | 12| 41| 0.01| 1.00| 0.99| 0.99|
## [1] "---------------------------------------------------------------------"
## [1] "Exterior1st"
##
##
## |Exterior1st | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |BrkComm | 11| 6| 0.00| 0.00| 0.00| 0.00|
## |AsphShn | 12| 2| 0.00| 0.00| 0.00| 0.00|
## |AsbShng | 12| 44| 0.02| 0.02| 0.00| 0.01|
## |CBlock | 12| 2| 0.00| 0.02| 0.02| 0.02|
## |Wd Sdng | 12| 411| 0.14| 0.16| 0.02| 0.09|
## |WdShing | 12| 56| 0.02| 0.18| 0.16| 0.17|
## |MetalSd | 12| 450| 0.15| 0.33| 0.18| 0.26|
## |Stucco | 12| 43| 0.01| 0.35| 0.33| 0.34|
## |HdBoard | 12| 442| 0.15| 0.50| 0.35| 0.42|
## |Plywood | 12| 221| 0.08| 0.57| 0.50| 0.54|
## |BrkFace | 12| 87| 0.03| 0.60| 0.57| 0.59|
## |CemntBd | 12| 126| 0.04| 0.65| 0.60| 0.63|
## |VinylSd | 12| 1025| 0.35| 1.00| 0.65| 0.82|
## |Stone | 12| 2| 0.00| 1.00| 1.00| 1.00|
## |ImStucc | 12| 1| 0.00| 1.00| 1.00| 1.00|
## |N/A | NaN| 1| 0.00| 1.00| 1.00| 1.00|
## [1] "---------------------------------------------------------------------"
## [1] "Exterior2nd"
##
##
## |Exterior2nd | BoxCox_SalePrice| n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|----------------:|----:|----------:|----------------:|-----------------:|-----------:|
## |CBlock | 12| 3| 0.00| 0.00| 0.00| 0.00|
## |AsbShng | 12| 38| 0.01| 0.01| 0.00| 0.01|
## |Brk Cmn | 12| 22| 0.01| 0.02| 0.01| 0.02|
## |AsphShn | 12| 4| 0.00| 0.02| 0.02| 0.02|
## |Wd Sdng | 12| 391| 0.13| 0.16| 0.02| 0.09|
## |Stucco | 12| 47| 0.02| 0.17| 0.16| 0.16|
## |MetalSd | 12| 447| 0.15| 0.33| 0.17| 0.25|
## |Wd Shng | 12| 81| 0.03| 0.35| 0.33| 0.34|
## |Stone | 12| 6| 0.00| 0.36| 0.35| 0.35|
## |HdBoard | 12| 406| 0.14| 0.50| 0.36| 0.43|
## |Plywood | 12| 270| 0.09| 0.59| 0.50| 0.54|
## |BrkFace | 12| 47| 0.02| 0.60| 0.59| 0.60|
## |CmentBd | 12| 126| 0.04| 0.65| 0.60| 0.63|
## |VinylSd | 12| 1014| 0.35| 0.99| 0.65| 0.82|
## |ImStucc | 12| 15| 0.01| 1.00| 0.99| 1.00|
## |Other | 13| 1| 0.00| 1.00| 1.00| 1.00|
## |N/A | NaN| 1| 0.00| 1.00| 1.00| 1.00|
summary(M) # column-by-column summary of the data frame M, printed below
## Id MSSubClass MSZoning lot_frontage lot_area
## Length:2919 Min. :0.00 Min. :0.00 Min. : 21 Min. : 46
## Class :character 1st Qu.:0.00 1st Qu.:1.00 1st Qu.: 60 1st Qu.: 86
## Mode :character Median :1.00 Median :1.00 Median : 70 Median : 95
## Mean :0.64 Mean :0.78 Mean : 70 Mean : 94
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.: 80 3rd Qu.:103
## Max. :1.00 Max. :1.00 Max. :136 Max. :158
##
## Street Alley LotShape LandContour Utilities
## Min. :0 Min. :0.00 Min. :0.00 Min. :0.0 Min. :0
## 1st Qu.:1 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:1.0 1st Qu.:1
## Median :1 Median :1.00 Median :0.00 Median :1.0 Median :1
## Mean :1 Mean :0.93 Mean :0.36 Mean :0.9 Mean :1
## 3rd Qu.:1 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.0 3rd Qu.:1
## Max. :1 Max. :1.00 Max. :1.00 Max. :1.0 Max. :1
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:1.00 1st Qu.:1.00
## Median :1.00 Median :1.00 Median :0.00 Median :1.00 Median :1.00
## Mean :0.73 Mean :0.95 Mean :0.49 Mean :0.86 Mean :0.99
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.00
##
## BldgType HouseStyle OverallQual OverallCond year_built
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1805000
## 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:1908081
## Median :1.00 Median :0.00 Median :1.00 Median :1.00 Median :1946364
## Mean :0.83 Mean :0.35 Mean :0.62 Mean :0.58 Mean :1943692
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:2002000
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :1.00 Max. :2016032
##
## year_remod_add RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1901250 Min. :0.00 Min. :0.00 Min. :0.0 Min. :0.0
## 1st Qu.:1930612 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:0.0 1st Qu.:0.0
## Median :1986024 Median :1.00 Median :1.00 Median :1.0 Median :1.0
## Mean :1968862 Mean :0.79 Mean :0.99 Mean :0.5 Mean :0.5
## 3rd Qu.:2008008 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.0 3rd Qu.:1.0
## Max. :2018040 Max. :1.00 Max. :1.00 Max. :1.0 Max. :1.0
##
## MasVnrType mas_vnr_area ExterQual ExterCond Foundation
## Min. :0.00 Min. :0.0 Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:1.00 1st Qu.:0.00
## Median :0.00 Median :0.0 Median :0.00 Median :1.00 Median :0.00
## Mean :0.39 Mean :1.1 Mean :0.37 Mean :0.87 Mean :0.45
## 3rd Qu.:1.00 3rd Qu.:2.9 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :3.2 Max. :1.00 Max. :1.00 Max. :1.00
##
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 bsmt_fin_sf1
## Min. :0.0 Min. :0.00 Min. :0.00 Min. :0.00 Min. : 0.0
## 1st Qu.:0.0 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.: 0.0
## Median :1.0 Median :1.00 Median :1.00 Median :1.00 Median :12.5
## Mean :0.5 Mean :0.89 Mean :0.65 Mean :0.58 Mean : 9.5
## 3rd Qu.:1.0 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:15.4
## Max. :1.0 Max. :1.00 Max. :1.00 Max. :1.00 Max. :19.4
##
## BsmtFinType2 bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf Heating
## Min. :0.00 Min. :0.00 Min. : 0 Min. : 0 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:25 1st Qu.:304 1st Qu.:1.00
## Median :1.00 Median :0.00 Median :36 Median :365 Median :1.00
## Mean :0.85 Mean :0.08 Mean :36 Mean :378 Mean :0.98
## 3rd Qu.:1.00 3rd Qu.:0.00 3rd Qu.:48 3rd Qu.:459 3rd Qu.:1.00
## Max. :1.00 Max. :0.68 Max. :70 Max. :709 Max. :1.00
##
## HeatingQC CentralAir Electrical x1st_flr_sf x2nd_flr_sf
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :6.3 Min. :0.0
## 1st Qu.:0.00 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:6.8 1st Qu.:0.0
## Median :1.00 Median :1.00 Median :1.00 Median :7.0 Median :0.0
## Mean :0.51 Mean :0.93 Mean :0.92 Mean :7.0 Mean :1.9
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:7.2 3rd Qu.:4.4
## Max. :1.00 Max. :1.00 Max. :1.00 Max. :7.7 Max. :4.6
##
## low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath
## Min. : 0 Min. :6.5 Min. :0.00 Min. :0.00 Min. :1.00
## 1st Qu.: 0 1st Qu.:7.0 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:1.00
## Median : 0 Median :7.3 Median :0.00 Median :0.00 Median :2.00
## Mean : 2 Mean :7.3 Mean :0.43 Mean :0.06 Mean :1.57
## 3rd Qu.: 0 3rd Qu.:7.5 3rd Qu.:1.00 3rd Qu.:0.00 3rd Qu.:2.00
## Max. :154 Max. :8.0 Max. :2.00 Max. :1.00 Max. :3.00
##
## half_bath bedroom_abv_gr kitchen_abv_gr KitchenQual tot_rms_abv_grd
## Min. :0.00 Min. :1.0 Min. :1.00 Min. :0.00 Min. :1.39
## 1st Qu.:0.00 1st Qu.:2.0 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:1.61
## Median :0.00 Median :3.0 Median :1.00 Median :0.00 Median :1.79
## Mean :0.37 Mean :2.9 Mean :1.04 Mean :0.46 Mean :1.84
## 3rd Qu.:1.00 3rd Qu.:3.0 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.95
## Max. :1.00 Max. :5.0 Max. :2.00 Max. :1.00 Max. :2.40
##
## Functional fireplaces FireplaceQu GarageType garage_yr_blt
## Min. :0.00 Min. :0.00 Min. :0.0 Min. :0.00 Min. :1833612
## 1st Qu.:1.00 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:1916882
## Median :1.00 Median :1.00 Median :0.0 Median :1.00 Median :1954264
## Mean :0.93 Mean :0.59 Mean :0.5 Mean :0.65 Mean :1953346
## 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:1.0 3rd Qu.:1.00 3rd Qu.:2002000
## Max. :1.00 Max. :2.00 Max. :1.0 Max. :1.00 Max. :2018040
##
## GarageFinish garage_cars garage_area GarageQual GarageCond
## Min. :0.00 Min. :0.00 Min. : 0 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:1.00 1st Qu.: 320 1st Qu.:1.00 1st Qu.:1.00
## Median :1.00 Median :2.00 Median : 480 Median :1.00 Median :1.00
## Mean :0.52 Mean :1.76 Mean : 471 Mean :0.89 Mean :0.91
## 3rd Qu.:1.00 3rd Qu.:2.00 3rd Qu.: 576 3rd Qu.:1.00 3rd Qu.:1.00
## Max. :1.00 Max. :3.00 Max. :1019 Max. :1.00 Max. :1.00
##
## PavedDrive wood_deck_sf open_porch_sf enclosed_porch x3ssn_porch
## Min. :0.0 Min. :0.0 Min. :0.0 Min. :0.00 Min. : 0
## 1st Qu.:1.0 1st Qu.:0.0 1st Qu.:0.0 1st Qu.:0.00 1st Qu.: 0
## Median :1.0 Median :0.0 Median :3.4 Median :0.00 Median : 0
## Mean :0.9 Mean :2.0 Mean :2.5 Mean :0.12 Mean : 2
## 3rd Qu.:1.0 3rd Qu.:4.2 3rd Qu.:4.5 3rd Qu.:0.00 3rd Qu.: 0
## Max. :1.0 Max. :4.9 Max. :6.1 Max. :0.77 Max. :144
##
## screen_porch PoolQC Fence MiscFeature misc_val
## Min. : 0 Min. :0 Min. :0.0 Min. :0.00 Min. : 0
## 1st Qu.: 0 1st Qu.:1 1st Qu.:1.0 1st Qu.:1.00 1st Qu.: 0
## Median : 0 Median :1 Median :1.0 Median :1.00 Median : 0
## Mean : 15 Mean :1 Mean :0.8 Mean :0.96 Mean : 23
## 3rd Qu.: 0 3rd Qu.:1 3rd Qu.:1.0 3rd Qu.:1.00 3rd Qu.: 0
## Max. :260 Max. :1 Max. :1.0 Max. :1.00 Max. :982
##
## mo_sold yr_sold SaleType SaleCondition SalePrice
## Min. : 1.0 Min. :2006 Min. :0.00 Min. :0.00 Min. : 34900
## 1st Qu.: 4.0 1st Qu.:2007 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:129975
## Median : 6.0 Median :2008 Median :1.00 Median :1.00 Median :163000
## Mean : 6.2 Mean :2008 Mean :0.87 Mean :0.82 Mean :180921
## 3rd Qu.: 8.0 3rd Qu.:2009 3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.:214000
## Max. :12.0 Max. :2010 Max. :1.00 Max. :1.00 Max. :755000
## NA's :1459
## TrainTest Bin BoxCox_SalePrice
## Length:2919 [3.49e+04,1.4e+05]: 487 Min. :10
## Class :character (1.4e+05,1.9e+05] : 490 1st Qu.:12
## Mode :character (1.9e+05,7.55e+05]: 483 Median :12
## NA's :1459 Mean :12
## 3rd Qu.:12
## Max. :14
## NA's :1459
utils::write.csv( # export M to CSV; row.names = FALSE leaves out the row-name column
  M,
  file = "/Users/thienpham/Data Mining/data/prepared_house_prices.csv",
  row.names = FALSE
)
Take your data for supervised regression learning and prepare your predictors and your target variable. You do not need to perform feature selection yet; we will do that in an upcoming assignment.